{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explore\n",
    "\n",
    "Reference：\n",
    "* http://jakeboxer.com/blog/2009/12/13/the-knuth-morris-pratt-algorithm-in-my-own-words/\n",
    "* http://www.ruanyifeng.com/blog/2013/05/Knuth–Morris–Pratt_algorithm.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import string\n",
    "import random\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_pattern = 'abcdabd'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(2, 'ab', 0, ['a'], ['b'])\n",
      "[0, 0]\n",
      "(3, 'abc', 0, ['a', 'ab'], ['bc', 'c'])\n",
      "[0, 0, 0]\n",
      "(4, 'abcd', 0, ['a', 'ab', 'abc'], ['bcd', 'cd', 'd'])\n",
      "[0, 0, 0, 0]\n",
      "(5, 'abcda', 1, ['a', 'ab', 'abc', 'abcd'], ['bcda', 'cda', 'da', 'a'])\n",
      "[0, 0, 0, 0, 1]\n",
      "(6, 'abcdab', 2, ['a', 'ab', 'abc', 'abcd', 'abcda'], ['bcdab', 'cdab', 'dab', 'ab', 'b'])\n",
      "[0, 0, 0, 0, 1, 2]\n",
      "(7, 'abcdabd', 0, ['a', 'ab', 'abc', 'abcd', 'abcda', 'abcdab'], ['bcdabd', 'cdabd', 'dabd', 'abd', 'bd', 'd'])\n",
      "[0, 0, 0, 0, 1, 2, 0]\n"
     ]
    }
   ],
   "source": [
    "cur_prefix = []\n",
    "cur_suffix = []\n",
    "partial_match_value = [0]\n",
    "for i in range(2,len(sample_pattern)+1):\n",
    "    partial_pattern = sample_pattern[:i]\n",
    "    head = partial_pattern[-2]\n",
    "    tail = partial_pattern[-1]\n",
    "    if len(cur_prefix) == 0:\n",
    "        cur_prefix.append(head)\n",
    "    else:\n",
    "        p = cur_prefix[-1]\n",
    "        new_p = p+head\n",
    "        cur_prefix.append(new_p)\n",
    "    \n",
    "    new_suffix = []\n",
    "    for s in cur_suffix:\n",
    "        new_s = s+tail\n",
    "        new_suffix.append(new_s)\n",
    "    new_suffix.append(tail)\n",
    "    cur_suffix = new_suffix\n",
    "    \n",
    "    max_overlap = 0\n",
    "    for st in set.intersection(set(cur_suffix),set(cur_prefix)):\n",
    "        if len(st)>max_overlap:\n",
    "            max_overlap = len(st)\n",
    "    partial_match_value.append(max_overlap)\n",
    "    print(i,partial_pattern,max_overlap,cur_prefix,cur_suffix)\n",
    "    print(partial_match_value)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Function Develop\n",
    "\n",
    "Reference:\n",
    "\n",
    "https://www.matrix67.com/blog/archives/115"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# this table means if j+1 can not match, how many steps backward.\n",
    "def partial_match_table_naive(pattern_st):\n",
    "    cur_prefix = []\n",
    "    cur_suffix = []\n",
    "    partial_match_value = [0]\n",
    "    for i in range(2,len(pattern_st)+1):\n",
    "        partial_pattern = pattern_st[:i]\n",
    "        head = partial_pattern[-2]\n",
    "        tail = partial_pattern[-1]\n",
    "        if len(cur_prefix) == 0:\n",
    "            cur_prefix.append(head)\n",
    "        else:\n",
    "            p = cur_prefix[-1]\n",
    "            new_p = p+head\n",
    "            cur_prefix.append(new_p)\n",
    "\n",
    "        new_suffix = []\n",
    "        for s in cur_suffix:\n",
    "            new_s = s+tail\n",
    "            new_suffix.append(new_s)\n",
    "        new_suffix.append(tail)\n",
    "        cur_suffix = new_suffix\n",
    "\n",
    "        max_overlap = 0\n",
    "        for st in set.intersection(set(cur_suffix),set(cur_prefix)):\n",
    "            if len(st)>max_overlap:\n",
    "                max_overlap = len(st)\n",
    "        partial_match_value.append(max_overlap)\n",
    "    return partial_match_value\n",
    "\n",
    "def kmp_naive(sentence, pattern_st):\n",
    "    # return the position of the first match, if no, then -1\n",
    "    p = partial_match_table_naive(pattern_st)\n",
    "    j = 0\n",
    "    for i in range(len(sentence)):\n",
    "        while j > 0 and sentence[i] != pattern_st[j]:\n",
    "            j = p[j-1]\n",
    "        if sentence[i] == pattern_st[j]:\n",
    "            j += 1\n",
    "        if j == len(pattern_st):\n",
    "            return i-j+1\n",
    "    return -1\n",
    "    \n",
    "\n",
    "# this table means if j can not match, which position should j move to.\n",
    "def partial_match_table_god(pattern_st):\n",
    "    p = [0]\n",
    "    j = 0\n",
    "    for i in range(1,len(pattern_st)):\n",
    "        while j > 0 and pattern_st[i] != pattern_st[j]:\n",
    "            j = p[j-1]\n",
    "        if pattern_st[i] == pattern_st[j]:\n",
    "            j += 1\n",
    "        p.append(j)\n",
    "    return p\n",
    "\n",
    "def kmp(sentence, pattern_st):\n",
    "    # return the position of the first match, if no, then -1\n",
    "    p = partial_match_table_god(pattern_st)\n",
    "    j = 0\n",
    "    for i in range(len(sentence)):\n",
    "        while j > 0 and sentence[i] != pattern_st[j]:\n",
    "            j = p[j-1]\n",
    "        if sentence[i] == pattern_st[j]:\n",
    "            j += 1\n",
    "        if j == len(pattern_st):\n",
    "            return i-j+1\n",
    "    return -1\n",
    "\n",
    "def naive_match(haystack, needle):\n",
    "    n = len(haystack)\n",
    "    m = len(needle)\n",
    "    for i in range(n):\n",
    "        j = i\n",
    "        p = 0\n",
    "        while j<n and p<m and haystack[j] == needle[p]:\n",
    "            p += 1\n",
    "            j += 1\n",
    "        if p == m:\n",
    "            return i\n",
    "    return -1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0, 1, 2, 0]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "partial_match_table_god('llla')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kmp_naive('hello','ll')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-1"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kmp('aaaaa','bba')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kmp('aaaabbba','bba')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "naive_match('aaaabbba','bba')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 性能测试"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 随机数据匹配"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "sts = []\n",
    "pts = []\n",
    "n_iter = 10\n",
    "for iter in range(n_iter):\n",
    "    sts.append(''.join([random.choice(string.ascii_letters + string.digits) for n in xrange(10000000)]))\n",
    "    pts.append(''.join([random.choice(string.ascii_letters + string.digits) for n in xrange(2000)]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('time used:', 20.494627952575684)\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "for i in range(n_iter):\n",
    "    st = sts[i]\n",
    "    pt = pts[i]\n",
    "    naive_match(st,pt)\n",
    "print('time used:',time.time()-start_time)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('time used:', 20.881771087646484)\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "for iter in range(n_iter):\n",
    "    st = sts[i]\n",
    "    pt = pts[i]\n",
    "    kmp(st,pt)\n",
    "print('time used:',time.time()-start_time)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "结论：KMP 算法在随机数据匹配上，效果甚至不如朴素匹配算法，因为有额外的 partial_match_table 计算开销。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 有关数据匹配"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "sts = []\n",
    "pts = []\n",
    "n_iter = 10000\n",
    "rept = 1000\n",
    "pt_len = 50\n",
    "for iter in range(n_iter):\n",
    "    pt = ''.join([random.choice(string.ascii_letters + string.digits) for n in xrange(pt_len)])\n",
    "    pts.append(pt)\n",
    "    st = ''\n",
    "    for r in range(rept):\n",
    "        rand_st = pt[:random.randint(1,pt_len)]\n",
    "        st += rand_st\n",
    "    sts.append(st)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('time used:', 8.886002779006958)\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "for i in range(n_iter):\n",
    "    st = sts[i]\n",
    "    pt = pts[i]\n",
    "    naive_match(st,pt)\n",
    "print('time used:',time.time()-start_time)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('time used:', 3.1221609115600586)\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "for iter in range(n_iter):\n",
    "    st = sts[i]\n",
    "    pt = pts[i]\n",
    "    kmp(st,pt)\n",
    "print('time used:',time.time()-start_time)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "结论：KMP 算法在相关数据匹配上，效果要好于朴素匹配算法，此时 partial_match_table 发挥了作用。复杂度 O(m+n)\n",
    "\n",
    "Reference:\n",
    "* https://blog.csdn.net/zhanghaiyang9999/article/details/40709247\n",
    "\n",
    "好了，我要去学 BM 算法了。"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_py2",
   "language": "python",
   "name": "conda_py2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
